import folium
from sklearn.metrics import r2_score
from sklearn.metrics import mean_squared_error
import numpy as np
from math import pi
from sklearn import linear_model
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn.naive_bayes import MultinomialNB
from sklearn.neural_network import MLPClassifier
from sklearn.feature_selection import RFE
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OrdinalEncoder
import seaborn as sns;
import matplotlib.pylab as plt
%matplotlib inline
# Load the raw Queensland road-crash datasets (2001-2017; the location
# file was truncated to 2008-2017 because of GitHub's file-size limit).
location = pd.read_csv('files/location.csv')
road_casualities = pd.read_csv('files/roadcasualties.csv')
driver_demographics = pd.read_csv('files/driverdemographics.csv')
restraints = pd.read_csv('files/restrainthelmetuse.csv')
vehicles = pd.read_csv('files/vehicleinvolvement.csv')
factors = pd.read_csv('files/factorsinroadcrashes.csv')
# Supporting reference data: state population and vehicle registrations.
population = pd.read_csv('files/Queensland_population.csv')
vehicle_reg = pd.read_csv('files/Queensland_vehicle_registration.csv')
Everything around us can be represented and understood through numbers, and everything has a pattern — even road accidents. In this project, we are working with Queensland's Road Crash data from 2001-2017. The main goal of this project is to analyse the data, find patterns, and try to predict whether certain factors are responsible for crashes.
Unlike other projects, this project focuses more on visualising, analysing data and finding patterns than predicting.
Using Machine Learning to predict responsible factors for crashes.
This dataset contains information on crashes reported to the police which resulted from the movement of at least 1 road vehicle on a road or road related area.
All datasets have 17 years of data, from 2001 to 2017, except the location dataset. The location dataset had too much data (over 400k rows), which exceeds GitHub's single-file size limit, so it was truncated from the command prompt and now holds 10 years of data, from 2008 to 2017.
| Dataframe | Year Range | Source |
|---|---|---|
| location | 2008 - 2017 | Open Data Australia |
| road_casualities | 2001 - 2017 | Open Data Australia |
| driver_demographics | 2001 - 2017 | Open Data Australia |
| restraints | 2001 - 2017 | Open Data Australia |
| vehicles | 2001 - 2017 | Open Data Australia |
| factors | 2001 - 2017 | Open Data Australia |
| population | 2001 - 2017 | Population Australia |
| vehicle_reg | 2001 - 2017 | Department of Transport and Main Roads |
We will print the head of every dataframe, which shows its first 5 rows.
# Preview the first 5 rows of every dataframe.
# NOTE(review): these bare expressions only display output in a notebook,
# where each .head() presumably sits in its own cell; in a plain script
# the return values are discarded.
location.head()
road_casualities.head()
driver_demographics.head()
restraints.head()
vehicles.head()
factors.head()
population.head()
vehicle_reg.head()
These functions are used throughout the entire project in various situations.
#Converts given dataframe's categorical labels into numbers using OrdinalEncoder()
def ordinal_encoder(_data):
    """Return a copy of ``_data`` with every column ordinal-encoded.

    OrdinalEncoder encodes each feature independently, so a single
    ``fit_transform`` over the whole frame yields exactly the same values
    as the original per-column loop, without re-creating and re-fitting
    the encoder once per column. The input dataframe is not modified.
    """
    enc = OrdinalEncoder()
    # fit_transform returns a float ndarray; restore column names and index.
    return pd.DataFrame(enc.fit_transform(_data),
                        columns=_data.columns, index=_data.index)
#Returns crash year with casualty count from the given dataframe in a dictionary
def year_count(data):
    """Return ``{Crash_Year: total Casualty_Count}`` for the given dataframe.

    Uses a vectorised groupby-sum instead of the original per-row
    ``iloc`` double loop, which is O(n) Python-level indexing and very
    slow on large frames. Result keys come out sorted by year (dict
    equality is unaffected; plots over years generally benefit).
    """
    return data.groupby('Crash_Year')['Casualty_Count'].sum().to_dict()
#Returns month with casualty count from the given dataframe in a dictionary
def month_count(data):
    """Return ``{Crash_Month: total Count_Casualty_Total}`` for the dataframe.

    Replaces the original per-row ``iloc`` double loop with a single
    vectorised groupby-sum, which is equivalent and far faster.
    """
    return data.groupby('Crash_Month')['Count_Casualty_Total'].sum().to_dict()
#Returns day with casualty count from the given dataframe in a dictionary
def day_count(data):
    """Return ``{Crash_Day_Of_Week: total Count_Casualty_Total}``.

    Replaces the original per-row ``iloc`` double loop with a single
    vectorised groupby-sum, which is equivalent and far faster.
    """
    return data.groupby('Crash_Day_Of_Week')['Count_Casualty_Total'].sum().to_dict()
#Returns road user type with casualty count from the given dataframe in a dictionary
def cas_type(data):
    """Return ``{Casualty_Road_User_Type: total Casualty_Count}``.

    Replaces the original per-row ``iloc`` double loop with a single
    vectorised groupby-sum, which is equivalent and far faster.
    """
    return data.groupby('Casualty_Road_User_Type')['Casualty_Count'].sum().to_dict()
#Returns a dictionary of dictionaries which have casualty count for each day and hour from the given dataframe
def heatGen(data):
    """Return ``{hour: {day_name: crash_count}}`` covering all 24 hours.

    The original version initialised the grid twice (the first nested
    loop was dead code, entirely overwritten by the second) and then
    counted rows with slow per-row ``iloc`` indexing. Here the grid is
    built once with a dict comprehension and rows are counted by
    iterating the two columns directly.

    Raises KeyError if a row's Crash_Hour is outside 0-23 or its
    Crash_Day_Of_Week is not one of the seven day names (same as the
    original behaviour).
    """
    days_lst = ['Saturday', 'Sunday', 'Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday']
    dic = {hour: {day: 0 for day in days_lst} for hour in range(24)}
    for hour, day in zip(data['Crash_Hour'], data['Crash_Day_Of_Week']):
        dic[hour][day] += 1
    return dic
#Getting 2017's data from the whole dataframe
location_2017 = location[location.Crash_Year==2017].copy()
#Initialising the map with default coordinates
map_loc = folium.Map(
    location=[-27.470214, 153.022788],  # map centre — presumably Brisbane CBD
    tiles='Stamen Toner',
    zoom_start=12
)
#Looping through the dataframe: plot every 2017 crash as a small circle,
#coloured by severity (crimson = fatal, gold = injury). The original had
#the whole folium.Circle call duplicated in both branches with only the
#popup/colour differing, and used slow per-row iloc indexing.
for row in location_2017.itertuples():
    is_fatal = row.Crash_Severity == 'Fatal'
    folium.Circle(
        radius=2,
        location=[row.Crash_Latitude_GDA94, row.Crash_Longitude_GDA94],
        popup='Death' if is_fatal else 'Injured',
        color='crimson' if is_fatal else 'gold',
        fill=False,
    ).add_to(map_loc)
Zoom in or out for more details.
Note: the map does not show up in GitHub's output view. This is because the map is rendered through a JavaScript map backend, and GitHub's output view blocks JavaScript. Please use the HTML file to view the notebook, or run the Jupyter notebook file itself.
map_loc